# Load time series libraries
library(sweep) # Broom-style tidiers for the forecast package
library(forecast) # Forecasting models and predictions package
library(tidyquant) # Loads tidyverse, financial pkgs, used to get data
library(timetk) # Functions working with time series
# Library for scraping Bitcoin data
library(Quandl)
# Library for data manipulation
library(tidyverse)
The quandl_tidy function is a wrapper around the Quandl function that returns a cleaner tibble.
Quandl.api_key("5ydoG6gTCKjgzDpJp_1s") # 3GAtxPrAgoah7PyADPGy
quandl_tidy <- function(code, name) {
df <- Quandl(code) %>%
mutate(code = code, name = name) %>%
rename(date = Date, value = Value) %>%
arrange(date) %>%
as_tibble()
return(df)
}
bitcoin_price <- Quandl("BCHARTS/BITSTAMPUSD") %>%
arrange(Date) %>%
as_tibble()
colnames(bitcoin_price) <- c("date", "open", "high", "low", "close", "volume_btc", "volume_currency", "weighted_price")
After importing the Bitcoin price data, we narrow our focus to the Bitcoin daily open prices. We do so because the next day’s opening price is the price that earliest trade can be processed at.
Note that there are some days where the opening price is $0. In those cases, we will use the previous day’s opening price to fill these values.
bitcoin_price_tbl <-
bitcoin_price %>%
select(date, open) %>%
mutate(open = ifelse(open == 0, NA, open)) %>% # replace 0s with NAs
fill(open, .direction = c("down")) %>% # fill NAs with previous day's opening price
as_tibble()
head(bitcoin_price_tbl)
# # A tibble: 6 x 2
# date open
# <date> <dbl>
# 1 2011-09-13 5.8
# 2 2011-09-14 5.58
# 3 2011-09-15 5.12
# 4 2011-09-16 4.82
# 5 2011-09-17 4.87
# 6 2011-09-18 4.87
Split the data into training and testing data to test accuracy of forecasts. For now, we will set the training data period from January 2018 to June 2020, and the forecast period from July to November 2020.
We set the earliest date for the training data to be January 2018 as we are doing feature engineering for a separate model. Since the feature engineering involves the scraping of news articles related to Bitcoin, a limitation we face is backward navigation for articles that were published a long time ago. Thus, we decided that 2018 would be a reasonable time frame that provides us with enough news articles to build the corpus, at the same time making sure that we have a sufficient number of data points to train our forecasting models.
# Keep Jul to Nov 2020 as test data
bitcoin_price_test_tbl <-
bitcoin_price_tbl %>%
filter(date >= "2020-07-01" & date <= "2020-11-30")
# Keep Jan 2018 to Jun 2020 as train data
bitcoin_price_train_tbl <-
bitcoin_price_tbl %>%
filter(date >= "2018-01-01" & date <= "2020-06-30")
Sources:
We begin by visualising the daily opening prices for Bitcoin over the time period of the data.
# Plot daily Bitcoin opening prices
bitcoin_price_train_tbl %>%
ggplot(aes(date, open)) +
geom_line(col = palette_light()[6]) +
geom_ma(ma_fun = SMA, n = 30, size = 1, col = palette_light()[2]) +
theme_tq() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
scale_y_continuous(labels = scales::dollar_format()) +
labs(title = "Bitcoin Prices: Jan 2018 through Jun 2020")
Convert Bitcoin daily opening price data from tbl to a ts object.
# Convert from tbl to ts
bitcoin_price_ts <- tk_ts(bitcoin_price_train_tbl, start = bitcoin_price_train_tbl$date[1], freq = 365)
# Check that ts-object has a timetk index
# This will be important when using sw_sweep() later
has_timetk_idx(bitcoin_price_ts)
# [1] TRUE
Use the auto.arima() function from the forecast package to model the time series
# Model using auto.arima
fit_arima <- auto.arima(bitcoin_price_ts, lambda = "auto")
fit_arima
# Series: bitcoin_price_ts
# ARIMA(3,1,1)
# Box Cox transformation: lambda= 1.629794
#
# Coefficients:
# ar1 ar2 ar3 ma1
# -0.7657 -0.0364 0.064 0.7055
# s.e. 0.1286 0.0428 0.035 0.1251
#
# sigma^2 estimated as 13546935818: log likelihood=-11917.22
# AIC=23844.44 AICc=23844.5 BIC=23868.51
Tidy the model using sweep functions:
sw_tidy(): Get model coefficientssw_glance(): Get model description and training set accuracy metricssw_augment(): Get model residuals# sw_tidy - Get model coefficients
sw_tidy(fit_arima)
# # A tibble: 4 x 2
# term estimate
# <chr> <dbl>
# 1 ar1 -0.766
# 2 ar2 -0.0364
# 3 ar3 0.0640
# 4 ma1 0.705
# sw_glance - Get model description and training set accuracy measures
sw_glance(fit_arima) %>%
glimpse()
# Rows: 1
# Columns: 12
# $ model.desc <chr> "ARIMA(3,1,1)"
# $ sigma <dbl> 116391.3
# $ logLik <dbl> -11917.22
# $ AIC <dbl> 23844.44
# $ BIC <dbl> 23868.51
# $ ME <dbl> -5.288059
# $ RMSE <dbl> 359.3997
# $ MAE <dbl> 223.211
# $ MPE <dbl> -0.1446963
# $ MAPE <dbl> 2.814793
# $ MASE <dbl> 0.06113198
# $ ACF1 <dbl> -0.0176028
# sw_augment - get model residuals
sw_augment(fit_arima, timetk_idx = TRUE)
# # A tibble: 912 x 4
# index .actual .fitted .resid
# <date> <dbl> <dbl> <dbl>
# 1 2018-01-01 13880 13871. 3460.
# 2 2018-01-02 13394. 13877. -194076.
# 3 2018-01-03 14671. 13431. 507698.
# 4 2018-01-04 15156. 14590. 240122.
# 5 2018-01-05 15144. 15108. 15334.
# 6 2018-01-06 16928. 15240. 752607.
# 7 2018-01-07 17142. 16789. 163118.
# 8 2018-01-08 16174. 17163. -451146.
# 9 2018-01-09 15000. 16323. -579995.
# 10 2018-01-10 14404. 15015. -257484.
# # … with 902 more rows
# plot residuals
sw_augment(fit_arima, timetk_idx = TRUE) %>%
ggplot(aes(x = index, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residual Diagnostic") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme_tq()
Use the forecast() function to forecast Bitcoin prices for the next month.
# Forecast next 30 days
fcast_arima <- forecast(fit_arima, h = 153)
# Check if forecast has timetk index
has_timetk_idx(fcast_arima)
# [1] TRUE
# tidy forecast output
fcast_tbl <- sw_sweep(fcast_arima, timetk_idx = TRUE)
fcast_tbl
# # A tibble: 1,065 x 7
# index key open lo.80 lo.95 hi.80 hi.95
# <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2018-01-01 actual 13880 NA NA NA NA
# 2 2018-01-02 actual 13394. NA NA NA NA
# 3 2018-01-03 actual 14671. NA NA NA NA
# 4 2018-01-04 actual 15156. NA NA NA NA
# 5 2018-01-05 actual 15144. NA NA NA NA
# 6 2018-01-06 actual 16928. NA NA NA NA
# 7 2018-01-07 actual 17142. NA NA NA NA
# 8 2018-01-08 actual 16174. NA NA NA NA
# 9 2018-01-09 actual 15000. NA NA NA NA
# 10 2018-01-10 actual 14404. NA NA NA NA
# # … with 1,055 more rows
Compare ARIMA forecasts with actual Bitcoin prices from July 2020 to Nov 2020.
# Visualize the forecast with ggplot
fcast_tbl %>%
ggplot(aes(x = index, y = open, color = key)) +
# 95% CI
geom_ribbon(aes(ymin = lo.95, ymax = hi.95),
fill = "#D5DBFF", color = NA, size = 0) +
# 80% CI
geom_ribbon(aes(ymin = lo.80, ymax = hi.80, fill = key),
fill = "#596DD5", color = NA, size = 0, alpha = 0.8) +
# Prediction
geom_line() +
# geom_point() +
# Actuals
geom_line(aes(x = date, y = open), color = palette_light()[[1]], data = bitcoin_price_test_tbl) +
# geom_point(aes(x = date, y = price), color = palette_light()[[1]], data = bitcoin_price_test_tbl) +
# Aesthetics
labs(title = "Bitcoin Prices Forecast", x = "", y = "Opening Price",
subtitle = "ARIMA(3,1,1)") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
scale_y_continuous(labels = scales::dollar_format()) +
scale_color_tq() +
scale_fill_tq() +
theme_tq()
Calculate errors for forecast Bitcoin prices.
# Investigate test error
error_tbl <- left_join(bitcoin_price_test_tbl, fcast_tbl, by = c("date" = "index")) %>%
rename(actual = open.x, pred = open.y) %>%
select(date, actual, pred) %>%
mutate(
error = actual - pred,
error_pct = error / actual
)
# Calculate test error metrics
test_residuals <- error_tbl$error
test_error_pct <- error_tbl$error_pct * 100 # Percentage error
me <- mean(test_residuals, na.rm=TRUE)
rmse <- mean(test_residuals^2, na.rm=TRUE)^0.5
mae <- mean(abs(test_residuals), na.rm=TRUE)
mape <- mean(abs(test_error_pct), na.rm=TRUE)
mpe <- mean(test_error_pct, na.rm=TRUE)
tibble(me, rmse, mae, mape, mpe) %>% glimpse()
# Rows: 1
# Columns: 5
# $ me <dbl> 2814.422
# $ rmse <dbl> 3789.792
# $ mae <dbl> 2821.617
# $ mape <dbl> 20.60938
# $ mpe <dbl> 20.53032
The modeltime package allows us to easily build several time series models using the tidymodels framework and compare the predictions of all models on the same plot.
library(tidymodels)
library(modeltime)
library(lubridate)
# Model 1: auto_arima
model_fit_arima_no_boost <- arima_reg() %>%
set_engine(engine = "auto_arima") %>%
fit(open ~ date, data = bitcoin_price_train_tbl)
# Model 2: arima with XGBoost
model_fit_arima_boosted <- arima_boost(
min_n = 2,
learn_rate = 0.015
) %>%
set_engine(engine = "auto_arima_xgboost") %>%
fit(open ~ date + as.numeric(date) + factor(day(date), ordered = F),
data = bitcoin_price_train_tbl)
# Model 3: error-trend-season (ets)
model_fit_ets <- exp_smoothing() %>%
set_engine(engine = "ets") %>%
fit(open ~ date, data = bitcoin_price_train_tbl)
# Model 4: prophet
model_fit_prophet <- prophet_reg() %>%
set_engine(engine = "prophet") %>%
fit(open ~ date, data = bitcoin_price_train_tbl)
# Model 5: prophet with XGBoost
model_fit_prophet_boosted <- prophet_boost(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = FALSE,
changepoint_range = 0.90,
trees = 300,
mtry = 0.50,
min_n = 30,
learn_rate = 0.15
) %>%
set_engine("prophet_xgboost") %>%
fit(open ~ date, data = bitcoin_price_train_tbl)
# Add fitted models to a Modeltime Table
models_tbl <- modeltime_table(
model_fit_arima_no_boost,
model_fit_arima_boosted,
model_fit_ets,
model_fit_prophet,
model_fit_prophet_boosted
)
# Calibrate the model to test data
calibration_tbl <- models_tbl %>%
modeltime_calibrate(new_data = bitcoin_price_test_tbl)
# Visualise forecasted values for all models
calibration_tbl %>%
modeltime_forecast(
new_data = bitcoin_price_test_tbl,
actual_data = rbind(bitcoin_price_train_tbl,
bitcoin_price_test_tbl)
) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
.interactive = TRUE
)
From the chart above, the predicted values generated by the normal prophet model stands out the most due to its variability as compared to the straight line predictions for the rest of the models.
calibration_tbl %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(
.interactive = TRUE
)
Comparing the accuracy of the 5 time series models we trained, the prophet models have the lowest MAPE scores. However, the prices of Bitcoin are unlikely to exhibit a steady rise as predicted by the prophet_boost() model. Thus, we will treat the base prophet model (model 4) as the most ideal time series model.
# Keep prophet forecasts
prophet_forecasts <-
calibration_tbl %>%
modeltime_forecast(
new_data = bitcoin_price_test_tbl,
actual_data = rbind(bitcoin_price_train_tbl,
bitcoin_price_test_tbl)
) %>% filter(.model_id == 4) %>%
select(.index, .value) %>%
rename(date = .index, open = .value)
# Append the last row of train data to prospect forecasts
prophet_forecasts <-
rbind(tail(bitcoin_price_train_tbl, 1),
prophet_forecasts)
# New column that contains 1 if Bitcoin prices increase, 0 otherwise
prophet_forecasts_sign <-
prophet_forecasts %>%
mutate(pred_signal = ifelse(lead(prophet_forecasts$open, 2) > lead(prophet_forecasts$open),
1, 0)
) %>%
filter(date >= "2020-07-01")
prophet_forecasts_sign
# # A tibble: 153 x 3
# date open pred_signal
# <date> <dbl> <dbl>
# 1 2020-07-01 10371. 0
# 2 2020-07-02 10420. 1
# 3 2020-07-03 10406. 1
# 4 2020-07-04 10497. 1
# 5 2020-07-05 10557. 1
# 6 2020-07-06 10559. 1
# 7 2020-07-07 10605. 1
# 8 2020-07-08 10617. 0
# 9 2020-07-09 10647. 1
# 10 2020-07-10 10616. 1
# # … with 143 more rows
Export the prophet prediction signals to be compared with the predictions of other models.
# Export date and prophet forecast signals
prophet_forecasts_sign %>%
select(date, pred_signal) %>%
rename(prophet_pred_signal = pred_signal) %>%
filter(date < "2020-11-29") %>%
write.csv(file="../data/prophet_signals_open.csv", row.names = FALSE)
We simulate the execution of a trading strategy where we buy Bitcoin on the days the prophet model predicts a day-on-day increase in Bitcoin prices and sell otherwise.
# Add last row of train data to test data
bitcoin_price_test_returns <-
rbind(tail(bitcoin_price_train_tbl, 1),
bitcoin_price_test_tbl)
# Compute future return and future return sign for test data
bitcoin_price_test_returns <-
bitcoin_price_test_returns %>% tq_mutate(select = open,
mutate_fun = periodReturn,
period = 'daily',
type = 'arithmetic',
col_rename = 'future_return') %>%
mutate(future_return = lead(future_return, n=2, default = 0),
future_return_sign = as.factor(ifelse(future_return > 0, 1, 0))) %>%
filter(date >= "2020-07-01")
# Merge test data and prophet prediction signal
test_and_prophet <-
bitcoin_price_test_returns %>%
bind_cols(select(prophet_forecasts_sign, "pred_signal")) %>%
mutate(return_buyhold = cumprod(1 + future_return),
return_prophet = cumprod(1 + lead(future_return) * pred_signal)) %>%
filter(date < "2020-11-29") # remove NAs
# Visualisation of prophet strategy vs buy-and-hold strategy
test_and_prophet %>%
select(date, return_buyhold, return_prophet) %>%
gather(key = "strategy", value = "returns", -date) %>%
ggplot(aes(x=date, y = returns)) +
geom_line(aes(color = strategy)) +
theme_light()